'''
The purpose of this script is to correlate the protest size estimated from Twitter with size recorded in newspapers.  The newspaper records were catalogued by undergraduates working for me.
'''
##########################
##
##	GLOBALS
##
##########################
library(dplyr) # For aggregation
library(reshape)
library(ggplot2)
library(scales)  # For logged axis labels
library(olsrr)  # For residual tests

#setwd('<path/to/Replication/>')


# dv = string of DV name
# iv = string of IV name
residualCalculation <- function(dv, iv, data, fileCustomizer){
	dv <- data[[dv]]
	iv <- data[[iv]]
	m1 <- lm(log(dv+1,10)~log(iv+1,10), data=data)

	test <- ols_test_normality(m1)

	jpeg(filename=paste0('./Figures/dvVerify_residualFit_', fileCustomizer, '.jpeg'), width=5, height=5, res=300, units='in')
	par(mar=c(5, 5, 1, 1))
	ols_plot_resid_fit2(m_seoul1)
	dev.off()

	return(test)
}


ols_plot_resid_fit2 <- function(model){
    #check_model(model)  # Cannot find this function
    predicted <- NULL
    resid <- NULL
    d <- rvspdata(model)
    p <- ggplot(d, aes(x = predicted, y = resid)) + geom_point(shape = 20, size=4) + xlab("Non-Image Estimate") + ylab("Residual Using Image Estimate") +
        ggtitle("") + geom_hline(yintercept = 0,
        linetype='dashed') + theme_classic() + theme(panel.grid.minor = element_blank(),panel.grid.major=element_blank(), panel.border=element_blank(), axis.line=element_line(color='black'), axis.title.x = element_text(size=16),axis.text.x=element_text(size=16),axis.title.y = element_text(size=16),axis.text.y=element_text(size=16))
    print(p)
	}


# See https://rdrr.io/cran/olsrr/src/R/ols-residual-vs-predicted-plot.R
rvspdata <- function(model) {
  resid     <- residuals(model)
  predicted <- fitted(model)
  data.frame(predicted = predicted, resid = resid)
}



##########################
##
##	DATA
##
##########################

######
# TWITTER
######
countryday <- read.csv('./Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays.csv', stringsAsFactors=FALSE)

countryday$country <- countryday$place.country_code

long <- melt(countryday, id.vars=c('country', 'city_use', 'day'), measure.vars='totalFaces_Sum', variable_name='Measure')
names(long)[names(long) == 'value'] <- 'totalFaces_Sum'

##### SUBSET
tw_seoul <- long[long$city_use == 'Seoul',]
tw_barcelona <- long[long$city_use == 'Barcelona' & long$country == 'ES',]
tw_hk <- long[long$city_use == 'Central',]
tw_caracas <- long[long$city_use == 'Caracas',]


######
# MANUAL
######

##### READ DATA
#The data on the Korean attendance is from this page: https://ko.wikipedia.org/wiki/%EB%B0%95%EA%B7%BC%ED%98%9C_%EB%8C%80%ED%86%B5%EB%A0%B9_%ED%87%B4%EC%A7%84_%EC%9A%B4%EB%8F%99.  The first numeric column is the police's estimate; the second, organizers'; and the third, the number of police.
seoul <- data.frame(date=c('2016-10-29', '2016-11-05', '2016-11-12', '2016-11-19', '2016-11-26', '2016-11-30', '2016-12-03', '2016-12-10', '2016-12-17', '2016-12-24', '2016-12-31', '2017-01-07', '2017-01-14', '2017-01-21', '2017-02-04', '2017-02-11', '2017-02-18', '2017-02-25', '2017-03-01', '2017-03-04', '2017-03-11', '2017-03-25', '2017-04-15', '2017-04-29'), protesters_police = c(12000, 48000, 269000, 275000, 330000, 8000, 429000, 166000, 77000, 53000, 83000, 38000, rep(NA, 12)), protesters_activists=c(50000, 300000, 1060000, 960000, 1900000, 60000, 2320000, 1043400, 771750, 702000, 1104000, 643380, 146700, 353400, 425500, 806270, 844860, 1078130, 300000, 1050890, 720000, 102400, 109600, 50000), police_amount=c(4800, 17600, 25000, 20000, 25000, 8000, 30000, 18200, 18240, 14700, 18400, 15920, 14700, 15500, 14600, 16000, 15200, 17000, 16000, 15900, 38100, 12300, NA, NA))

barcelona <- read.csv('./Data/manualProtestCoding/ProtestSizeTracking-Barcelona.csv', stringsAsFactors=FALSE)
hk <- read.csv('./Data/manualProtestCoding/ProtestSizeTracking-HongKong_v2.csv', stringsAsFactors=FALSE)
caracasMMAD <- read.csv('./Data/manualProtestCoding/ProtestSizeTracking-Caracas_MMAD.csv', stringsAsFactors=FALSE)  # This also already has the Twitter estimates
caracasFR <- read.csv('./Data/manualProtestCoding/ProtestSizeTracking-Caracas_FranciscoRodriguez.csv', stringsAsFactors=FALSE)
caracasFR <- caracasFR[,c('Date', 'Opposition')]
caracasFR$Opposition <- gsub(',', '', caracasFR$Opposition)

##### MAKE DAY
barcelona$month <- sapply(barcelona$protest_date, function(x) strsplit(x, '/')[[1]][1])
barcelona$dayOfMonth <- sapply(barcelona$protest_date, function(x) strsplit(x, '/')[[1]][2])
barcelona$year <- sapply(barcelona$protest_date, function(x) strsplit(x, '/')[[1]][3])
barcelona$day <- paste(barcelona$year, barcelona$month, barcelona$dayOfMonth, sep='-')

hk$month <- sapply(hk$protest_date, function(x) strsplit(x, '/')[[1]][1])
hk$dayOfMonth <- sapply(hk$protest_date, function(x) strsplit(x, '/')[[1]][2])
hk$year <- sapply(hk$protest_date, function(x) strsplit(x, '/')[[1]][3])
hk$day <- paste(hk$year, hk$month, hk$dayOfMonth, sep='-')

##### MAKE SIZE NUMERIC
# Change name in case I later want to do something with original variable
barcelona$num_participants2 <- as.numeric(barcelona$num_participants)
hk$num_participants2 <- as.numeric(hk$num_participants)
caracasFR$Opposition <- as.numeric(caracasFR$Opposition)

##### AGGREGATE BY DAY
barcelona_agg <- data.frame(barcelona %>% group_by(day) %>% summarize(size=mean(num_participants2, na.rm=TRUE)))
hk_agg <- data.frame(hk %>% group_by(day) %>% summarize(size=mean(num_participants2, na.rm=TRUE)))

######
# Merge
######

##### SEOUL
seoul_merged <- merge(tw_seoul, seoul, by.x='day', by.y='date')

##### BARCELONA
barcelona_merged <- merge(tw_barcelona[,c('day', 'totalFaces_Sum')], barcelona_agg[,c('day', 'size')], by.x='day', by.y='day', all.x=TRUE, all.y=FALSE)

##### HONG KONG
hk_merged <- merge(tw_hk[,c('day', 'totalFaces_Sum')], hk_agg[,c('day', 'size')], by.x='day', by.y='day', all.x=TRUE, all.y=FALSE)

#### CARACAS
caracas_merged <- merge(tw_caracas[,c('day', 'totalFaces_Sum')], caracasFR[,c('Date', 'Opposition')], by.x='day', by.y='Date', all.x=TRUE, all.y=FALSE)
# Add results from MMAD
caracas_merged <- merge(caracas_merged, caracasMMAD[,c('Event_date', 'Size.y')], by.x='day', by.y='Event_date', all.x=TRUE, all.y=FALSE)
caracas_merged$size <- ifelse(is.na(caracas_merged$Size.y) == FALSE, caracas_merged$Size.y, caracas_merged$Opposition)

##########################
##
##  GET CORRELATIONS
##
##########################

######
# SEOUL
######
long2 <- melt(seoul_merged, id.vars=c('totalFaces_Sum'), measure.vars=c('protesters_police', 'protesters_activists'), variable_name='Estimate')

# Correlation of face count, police estimate
one=cor(seoul_merged$totalFaces_Sum, seoul_merged$protesters_police, use='complete.obs')

# Correlation of face count, activists estimate
two=cor(seoul_merged$totalFaces_Sum, seoul_merged$protesters_activists, use='complete.obs')

# Logged, correlation of face count, police estimate
five=cor(log(seoul_merged$totalFaces_Sum, 10), log(seoul_merged$protesters_police, 10), use='complete.obs')

# Logged, correlation of face count, activists estimate
six=cor(log(seoul_merged$totalFaces_Sum, 10), log(seoul_merged$protesters_activists, 10), use='complete.obs')



######
# BARCELONA
######
# Correlation of face count, police estimate
one=cor(barcelona_merged$totalFaces_Sum, barcelona_merged$size, use='complete.obs')

# Logged, correlation of face count, police estimate
five=cor(log(barcelona_merged$totalFaces_Sum, 10), log(barcelona_merged$size, 10), use='complete.obs')


######
# HONG KONG
######
# Correlation of face count, police estimate
one=cor(hk_merged$totalFaces_Sum, hk_merged$size, use='complete.obs')

# Logged, correlation of face count, police estimate
five=cor(log(hk_merged$totalFaces_Sum+1, 10), log(hk_merged$size+1, 10), use='complete.obs')


######
# CARACAS
######
# Correlation of face count, police estimate
caracas_merged <- subset(caracas_merged, day >= '2016-01-01')

one=cor(caracas_merged$totalFaces_Sum, caracas_merged$size, use='complete.obs')

# Logged, correlation of face count, police estimate
five=cor(log(caracas_merged$totalFaces_Sum+1, 10), log(caracas_merged$size+1, 10), use='complete.obs')


##########################
##
##  TEST RESIDUALS
##
##########################
### The returned objects contain the S-W and K-S values used in those columns in Table 3.  It also generates images used for Figure A12.
result_seoul1 <- residualCalculation(dv='protesters_police', iv='totalFaces_Sum', data=seoul_merged, fileCustomizer='seoulPoliceEstimate')
result_seoul2 <- residualCalculation(dv='protesters_activists', iv='totalFaces_Sum', data=seoul_merged, fileCustomizer='seoulActivistEstimate')
result_barca <- residualCalculation(dv='size', iv='totalFaces_Sum', data=rbind(barcelona_merged, barcelona_merged), fileCustomizer='barcelona')  # NB: Only 5 matches and get an error saying need at least 7, so I double
result_hk <- residualCalculation(dv='size', iv='totalFaces_Sum', data=hk_merged, fileCustomizer='hongkong')
result_caracas <- residualCalculation(dv='size', iv='totalFaces_Sum', data=caracas_merged, fileCustomizer='caracas')
